from lxml import etree
import cssselect
# # lxml is a library for parsing XML and HTML.
# text = """<!DOCTYPE html>
# <html lang="en">
# <head>
#     <meta charset="UTF-8">
#     <title>Title</title>
# </head>
# <body>
#      <div class="element">
#          <ul>
#              <li class="li1">li</li>
#              <li class="li2">li</li>
#              <li class="li3">li</li>
#          </ul>
#          <ul>
#              <li class="li1">li</li>
#              <li class="li2">li</li>
#              <li class="li3">li</li>
#          </ul>
#      </div>
# </body>
# </html>"""
#
# tree = etree.fromstring(text, parser=etree.HTMLParser())
# print(tree)
# print(type(tree))
# print(dir(tree))
# items = tree.findall('body/div/ul/li')
# for item in items:
#     print(item.text)






# Sample HTML document used by the queries below: a single
# <div class="element"> holding two <ul> lists, each with three <li> items
# that wrap an <a> link.
# The literal must open with exactly three quotes so that the markup starts
# directly with the doctype; an extra quote character would become part of
# the document text handed to the parser.
text = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
     <div class="element">
         <ul>
             <li class="li1"><a href="https://www.baidu.com">百度</a></li>
             <li class="li2"><a href="https://www.xiaomi.com">小米</a></li>
             <li class="li3"><a href="https://www.huawei.com">华为</a></li>
         </ul>
         <ul>
             <li class="li11"><a href="https://www.baidu.com">百度</a></li>
             <li class="li22"><a href="https://www.xiaomi.com">小米</a></li>
             <li class="li33"><a href="https://www.huawei.com">华为</a></li>
         </ul>
     </div>
</body>
</html>"""

# Parse the markup with lxml's HTML parser; `tree` is what the XPath
# queries further down are evaluated against.
tree = etree.HTML(text)


# titles = tree.cssselect('title')
# for title in titles:
#     print(title.text)
# print('============')
# lis = tree.cssselect('li')
# for li in lis:
#     print(li.text, li.attrib, li.tag)
# print('============')
# lis = tree.cssselect('ul li')
# for li in lis:
#     print(li.text, li.attrib, li.tag)
#     a = li.cssselect('a')[0]
#     print(a.text, a.attrib, a.tag)


# Paths that begin with "./" are evaluated relative to `tree`.
body = tree.xpath('./body')
print(body)

# A predicate in square brackets filters elements by attribute value.
div = tree.xpath('./body/div[@class="element"]')
print(div)

# "/" selects direct children only, while "//" selects descendants at any
# depth. In the markup above every <li> sits inside a <ul>, so '//div/li'
# matches nothing, whereas '//div//li' matches each <li> nested under a <div>.
for query_index, li_query in enumerate(('//div/li', '//div/li', '//div//li')):
    if query_index:
        # Separator line between the outputs of consecutive queries.
        print("------------")
    lis = tree.xpath(li_query)
    for li in lis:
        print(li.attrib)



















